{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pytorch\n",
    "import matplotlib\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n"
     ]
    }
   ],
   "source": [
    "# Read the engineered train / test splits from disk\n",
    "data_train = pd.read_csv('../data/engineered_data_train.csv')\n",
    "data_test = pd.read_csv('../data/engineered_data_test.csv')\n",
    "\n",
    "# Column 'Y' is the target; every remaining column is kept as a feature\n",
    "y_train, y_test = data_train['Y'], data_test['Y']\n",
    "X_train = data_train.drop(columns='Y')\n",
    "X_test = data_test.drop(columns='Y')\n",
    "\n",
    "# marker printed once loading has finished\n",
    "print(True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## KNN Model "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### CROSS VALIDATION & GRID SEARCH"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# CROSS VALIDATION sweep over n_neighbors\n",
    "#\n",
    "# Each candidate k is scored with out-of-fold predictions computed on the\n",
    "# training data only, so the test set is never used to choose k.\n",
    "from itertools import product\n",
    "from sklearn.metrics import roc_curve\n",
    "from sklearn.metrics import auc\n",
    "from sklearn.model_selection import cross_val_predict\n",
    "\n",
    "# explicit fold count, so the result does not depend on the sklearn default\n",
    "N_FOLDS = 5\n",
    "\n",
    "n_neighbor = list(range(1, 30))\n",
    "\n",
    "# one entry per k: [k, (fpr, tpr), auc_score]\n",
    "all_result = []\n",
    "\n",
    "for n in n_neighbor:\n",
    "    clf = KNeighborsClassifier(n_neighbors=n)\n",
    "    # column 1 of predict_proba is the second class in sorted label order;\n",
    "    # assumes Y is a 0/1 target so this is the positive class - TODO confirm\n",
    "    y_pred = cross_val_predict(\n",
    "        clf, X_train, y_train, cv=N_FOLDS, method='predict_proba')[:, 1]\n",
    "    fpr, tpr, _ = roc_curve(y_train, y_pred)\n",
    "    auc_score = auc(fpr, tpr)\n",
    "    print('{} : {}'.format(n, auc_score))\n",
    "    all_result.append([n, (fpr, tpr), auc_score])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# CROSS VALIDATION & Grid Search over (n_neighbors, algorithm, weights)\n",
    "#\n",
    "# Every combination is scored with out-of-fold predictions computed on the\n",
    "# training data only, so the test set is never used to choose parameters.\n",
    "from itertools import product\n",
    "from sklearn.metrics import roc_curve\n",
    "from sklearn.metrics import auc\n",
    "from sklearn.model_selection import cross_val_predict\n",
    "\n",
    "# explicit fold count, so the result does not depend on the sklearn default\n",
    "N_FOLDS = 5\n",
    "\n",
    "n_neighbor = list(range(1, 5))\n",
    "algorithm = ['auto', 'ball_tree', 'kd_tree']\n",
    "weights = ['uniform', 'distance']\n",
    "\n",
    "# cartesian product of the three lists; each item is (k, algorithm, weights)\n",
    "combs = product(n_neighbor, algorithm, weights)\n",
    "\n",
    "# one entry per combination: (params, (fpr, tpr), auc_score)\n",
    "all_result = []\n",
    "for params in combs:\n",
    "    k, algo, weighting = params\n",
    "    clf = KNeighborsClassifier(n_neighbors=k, algorithm=algo, weights=weighting)\n",
    "    # column 1 of predict_proba is the second class in sorted label order;\n",
    "    # assumes Y is a 0/1 target so this is the positive class - TODO confirm\n",
    "    y_pred = cross_val_predict(\n",
    "        clf, X_train, y_train, cv=N_FOLDS, method='predict_proba')[:, 1]\n",
    "    fpr, tpr, _ = roc_curve(y_train, y_pred)\n",
    "    auc_score = auc(fpr, tpr)\n",
    "    print('{} : {}'.format(params, auc_score))\n",
    "    all_result.append((params, (fpr, tpr), auc_score))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# CROSS VALIDATION & Grid Search with GridSearchCV\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.metrics import roc_curve\n",
    "from sklearn.metrics import auc\n",
    "\n",
    "# hold-out split of the training data (not consumed by the grid search below)\n",
    "X_tra, X_te, y_tra, y_te = train_test_split(X_train, y_train, test_size=0.3, random_state=12323)\n",
    "\n",
    "# hyper-parameter grid: currently k=1 with both weighting schemes;\n",
    "# extend these lists to search more neighbors or add 'algorithm'\n",
    "parameters = {'n_neighbors': [1], 'weights': ['uniform', 'distance']}\n",
    "\n",
    "classifier = KNeighborsClassifier()\n",
    "\n",
    "scores = ['accuracy', 'roc_auc', 'f1', 'recall']\n",
    "\n",
    "# With several metrics, refit has to name the scorer that selects the winner;\n",
    "# with refit=False neither best_params_ nor prediction is available.\n",
    "# cv is passed explicitly so the fold count does not depend on the sklearn version.\n",
    "clf = GridSearchCV(\n",
    "        classifier, parameters, scoring=scores, refit='roc_auc', cv=5)\n",
    "clf.fit(X_train, y_train)\n",
    "\n",
    "# compact table: one row per parameter set, mean CV score for every metric\n",
    "summary_cols = ['params'] + ['mean_test_{}'.format(s) for s in scores]\n",
    "print(pd.DataFrame(clf.cv_results_)[summary_cols])\n",
    "print('best params {}'.format(clf.best_params_))\n",
    "\n",
    "# Score the refit best model on the test set. Probabilities (not hard labels)\n",
    "# are used so the ROC curve has more than one threshold, matching the\n",
    "# predict_proba-based AUC of the earlier cells.\n",
    "y_pred = clf.predict_proba(X_test)[:, 1]\n",
    "\n",
    "fpr, tpr, _ = roc_curve(y_test, y_pred)\n",
    "# separate name so the imported auc() function is not overwritten\n",
    "auc_score = auc(fpr, tpr)\n",
    "\n",
    "print(auc_score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
